import PyPDF2 as pdf2
import re
from bs4 import BeautifulSoup as bs
def extract_abstract(pdf_path):
    """Extract the abstract from the first page of a paper PDF.

    Captures the text between 'Abstract' and 'Introduction' on page one,
    re-joins words hyphenated across line breaks, and flattens newlines
    into spaces.

    :param pdf_path: path to the PDF file (opened in binary mode).
    :return: the cleaned abstract text.
    :raises ValueError: if no Abstract...Introduction span is found.
    """
    # `with` guarantees the handle is closed (the original leaked it);
    # extract_text() must run while the file is still open.
    with open(pdf_path, 'rb') as fhandle:
        pdfreader = pdf2.PdfReader(fhandle)
        cont = pdfreader.pages[0].extract_text()
    # Raw string: '\s'/'\S' in a plain string is a deprecated escape.
    pattern = re.compile(r'Abstract([\s\S]+?)Introduction')
    matches = re.findall(pattern, cont)
    if not matches:
        # Original code crashed here with AttributeError ([].strip);
        # raise a descriptive error instead — callers already treat any
        # exception from this function as "skip this paper".
        raise ValueError(f'no abstract section found in {pdf_path}')
    return matches[0].strip('\n').replace('-\n', '').replace('\n', ' ')

# Download the ChatGLM3-6B checkpoint from ModelScope, then load the
# tokenizer and the model (fp16, on GPU, inference mode).
from modelscope import AutoTokenizer, AutoModel, snapshot_download

model_dir = snapshot_download("ZhipuAI/chatglm3-6b", revision="v1.0.0")
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).half().cuda().eval()

# Parse the saved CVPR 2023 open-access index page and collect
# (paper title, pdf path) pairs for every listed paper.
# `with` closes the handle (the original `open(...).read()` leaked it).
with open('./CVPR 2023 Open Access Repository.html', 'r', encoding='utf-8') as html_file:
    html_page = html_file.read()
soup = bs(html_page, 'lxml')
# Relevant anchors come in pairs: the paper's HTML page (whose text is the
# title) immediately followed by its PDF link.
title_pdfs = [p for p in soup.find_all('a') if 'paper' in str(p) or 'html' in str(p)]
# Extract the paper title and the pdf file url.
paper_pair = []
pattern = re.compile(r'href="([\s\S]+?)">')  # raw string for the regex escapes
for i in range(0, len(title_pdfs), 2):
    paper_title = title_pdfs[i].contents[0]
    pdf_url = re.findall(pattern, str(title_pdfs[i + 1]))[0]
    # Strip the site prefix so the URL doubles as a local relative path.
    pair = (paper_title, pdf_url.replace('https://openaccess.thecvf.com/content/CVPR2023/', ''))
    paper_pair.append(pair)

# Prompts sent to ChatGLM for each paper (in Chinese: the model is asked
# to translate/summarize into Chinese).
title_prompt = '请将该论文题目翻译为中文：'
abstract_prompt = '请使用中文简化该摘要内容：'

fail_list = []  # (title, pdf_path) pairs that raised during processing

# Tab-separated output, one row per paper: EN/ZH title, EN abstract,
# ZH summary, keywords, datasets, local file link.
with open('./cvpr2023_info_chatglm.csv', 'w+', encoding='utf-8') as f:
    f.write('论文题目(英文)\t论文题目(中文)\t摘要(英文)\t摘要简述(中文)\t主题词\t数据集\t文件链接(本地)\n')
    for i, pair in enumerate(paper_pair):
        try:
            abstr = extract_abstract(pair[1])
            # Title translation uses a fresh history so it is independent
            # of the abstract conversation below.
            title_zh, _ = model.chat(tokenizer, title_prompt + pair[0], history=[])
            abstr_bf, history = model.chat(tokenizer, abstract_prompt + abstr, history=[])
            # Follow-up questions reuse `history` so the model still has
            # the abstract in context.
            themes, history = model.chat(tokenizer, '根据该摘要能够总结出的主题词是什么？', history=history)
            dataset, history = model.chat(tokenizer, '该摘要中有提到数据集吗？有的话请给出数据集名称，要求仅使用一句话。', history=history)
            f.write(f'{pair[0]}\t{title_zh}\t{abstr}\t{abstr_bf}\t{themes}\t{dataset}\t{pair[1]}\n')
            print(f'{i}/{len(paper_pair)} finish {pair[1]}')
        except Exception as exc:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # interrupt the run; record the failure and keep going.
            print(pair, 'fail to extract abstract', exc)
            fail_list.append(pair)

# Persist the failed (title, pdf_path) pairs for later inspection/retry.
# `with` replaces the manual open/close so the handle cannot leak.
with open('fail_summary.txt', 'a+', encoding='utf-8') as fail_file:
    fail_file.write(str(fail_list))